import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import warnings
warnings.filterwarnings('ignore')
import sqlalchemy
import pandasql as ps
from pandasql import sqldf
mysql = lambda q: sqldf(q, globals())
import re
from collections import Counter
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
%matplotlib inline
from wordcloud import WordCloud
import plotly.graph_objects as go
df=pd.read_csv('XYZ Limited Sales Analysis.csv',encoding='latin1')
df.head()
| Row ID | Order ID | Year | Order Date | Ship Date | Ship Mode | Customer ID | Customer Name | Segment | Country | ... | Postal Code | Region | Product ID | Category | Sub-Category | Product Name | Sales | Quantity | Discount | Profit | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | CA-2013-152156 | 2013 | 11/9/2013 | 11/12/2013 | Second Class | CG-12520 | Claire Gute | Consumer | United States | ... | 42420 | South | FUR-BO-10001798 | Furniture | Bookcases | Bush Somerset Collection Bookcase | 261.9600 | 2 | 0.00 | 41.9136 |
| 1 | 2 | CA-2013-152156 | 2013 | 11/9/2013 | 11/12/2013 | Second Class | CG-12520 | Claire Gute | Consumer | United States | ... | 42420 | South | FUR-CH-10000454 | Furniture | Chairs | Hon Deluxe Fabric Upholstered Stacking Chairs,... | 731.9400 | 3 | 0.00 | 219.5820 |
| 2 | 3 | CA-2013-138688 | 2013 | 6/13/2013 | 6/17/2013 | Second Class | DV-13045 | Darrin Van Huff | Corporate | United States | ... | 90036 | West | OFF-LA-10000240 | Office Supplies | Labels | Self-Adhesive Address Labels for Typewriters b... | 14.6200 | 2 | 0.00 | 6.8714 |
| 3 | 4 | US-2012-108966 | 2012 | 10/11/2012 | 10/18/2012 | Standard Class | SO-20335 | Sean O'Donnell | Consumer | United States | ... | 33311 | South | FUR-TA-10000577 | Furniture | Tables | Bretford CR4500 Series Slim Rectangular Table | 957.5775 | 5 | 0.45 | -383.0310 |
| 4 | 5 | US-2012-108966 | 2012 | 10/11/2012 | 10/18/2012 | Standard Class | SO-20335 | Sean O'Donnell | Consumer | United States | ... | 33311 | South | OFF-ST-10000760 | Office Supplies | Storage | Eldon Fold 'N Roll Cart System | 22.3680 | 2 | 0.20 | 2.5164 |
5 rows × 22 columns
df['Year'].unique()
array([2013, 2012, 2011, 2014], dtype=int64)
df['Ship Mode'].unique()
array(['Second Class', 'Standard Class', 'First Class', 'Same Day'],
dtype=object)
df['Category'].unique()
array(['Furniture', 'Office Supplies', 'Technology'], dtype=object)
df['Sub-Category'].unique()
array(['Bookcases', 'Chairs', 'Labels', 'Tables', 'Storage',
'Furnishings', 'Art', 'Phones', 'Binders', 'Appliances', 'Paper',
'Accessories', 'Envelopes', 'Fasteners', 'Supplies', 'Machines',
'Copiers'], dtype=object)
Propotion of Sales and Profit by Region¶
region_sales=df.groupby('Region')['Sales'].sum().reset_index()
fig=px.pie(region_sales,values='Sales',names='Region',
title='Propotion of Sales by Region',hole=0.5,
color_discrete_sequence=px.colors.qualitative.Plotly)
fig.show()
region_profit=df.groupby('Region')['Profit'].sum().reset_index()
fig=px.pie(region_profit,values='Profit',names='Region',
title='Propotion of Profit by Region',hole=0.5,
color_discrete_sequence=px.colors.qualitative.Plotly)
fig.show()
q2 = """
SELECT Region, SUM(Sales) AS Total_Sales, SUM(Profit) AS Total_Profit, (SUM(Profit) * 100.0 / SUM(Sales)) AS Profit_Percentage
FROM df
GROUP BY Region
"""
df_2=mysql(q2)
fig = px.bar(df_2, x='Region', y='Profit_Percentage',
title='Profit Margin by Region',
labels={'Profit_Percentage':'Profit Margin (%)'},
text='Profit_Percentage')
fig.update_traces(texttemplate='%{text:.2f}%', textposition='outside')
fig.show()
Insights :¶
West and East Regions having about 61% Sales compared to South and Central (39%) but the contribution of profit from West and East are about 70% showing the company is doing well in the East and West region compared to South and Central(30%) and need it expand its business inthese regions.
Sales and Profit by Segment¶
Segment_sales=df.groupby('Segment')['Sales'].sum().reset_index()
fig=px.pie(Segment_sales,values='Sales',names='Segment',
title='Propotion of Sales by Segment',hole=0.5,
color_discrete_sequence=px.colors.qualitative.Safe)
fig.show()
Segment_profit=df.groupby('Segment')['Profit'].sum().reset_index()
fig=px.pie(Segment_profit,values='Profit',names='Segment',
title='Propotion of Profit by Segment',hole=0.5,
color_discrete_sequence=px.colors.qualitative.Safe)
fig.show()
q3 = """
SELECT Segment, SUM(Sales) AS Total_Sales, SUM(Profit) AS Total_Profit, (SUM(Profit) * 100.0 / SUM(Sales)) AS Profit_Percentage
FROM df
GROUP BY Segment
"""
df_1=mysql(q3)
fig = px.bar(df_1, x='Segment', y='Profit_Percentage',
title='Profit Margin by Segment',
labels={'Profit_Percentage':'Profit Margin (%)'},
text='Profit_Percentage')
fig.update_traces(texttemplate='%{text:.2f}%', textposition='outside')
fig.show()
Insights :¶
The larger amount of sales of about 51% and contibution of Profit of about 46% is coming from Consumer Segment followed by Corporate(30% Sales and 32% profit contribution) and Home office as least contributing Segment
Sales and Profit by Category¶
category_sales=df.groupby('Category')['Sales'].sum().reset_index()
fig=px.pie(category_sales,values='Sales',names='Category',
title='Propotion of Sales by Category',hole=0.5,
color_discrete_sequence=px.colors.qualitative.Alphabet)
fig.show()
category_profit=df.groupby('Category')['Profit'].sum().reset_index()
fig=px.pie(category_profit,values='Profit',names='Category',
title='Propotion of Profit by Category',hole=0.5,
color_discrete_sequence=px.colors.qualitative.Alphabet)
fig.show()
q2 = """
SELECT Category, SUM(Sales) AS Total_Sales, SUM(Profit) AS Total_Profit, (SUM(Profit) * 100.0 / SUM(Sales)) AS Profit_Percentage
FROM df
GROUP BY Category
"""
df_2=mysql(q2)
fig = px.bar(df_2, x='Category', y='Profit_Percentage',
title='Profit Margin by Category',
labels={'Profit_Percentage':'Profit Margin (%)'},
text='Profit_Percentage')
fig.update_traces(texttemplate='%{text:.2f}%', textposition='outside')
fig.show()
Insights :¶
Although the Sales of Company by different categories Technology(36%),Furniture(32%) and Office Supplies(31%) are equally distributed,The profit contribution varies largely especially in Furnitures.It contributes only 6.5 of Overall Profit which is small compared to its Sales(32%).Technology Profit margins are very good as it contributes 50% of Profit followed by Office Supplies(43%).
Order Count By Ship Mode¶
order_count_by_ship_mode = df['Ship Mode'].value_counts().reset_index()
order_count_by_ship_mode.columns = ['Ship Mode', 'Order Count']
fig = px.bar(order_count_by_ship_mode, x='Ship Mode', y='Order Count', title='Order Count by Ship Mode')
fig.show()
Insights:¶
From the above bar chart we can clearly see maximum orders were of Ship Mode Standard Class follwed by Second Class,First Class and Same Day with least number of order count.
Top 10 Product by Sales and Profit¶
top_products = df.groupby('Product Name')['Sales'].sum().nlargest(10).reset_index()
top_products = top_products.sort_values(by='Sales', ascending=True)
fig = px.bar(top_products, y='Product Name', x='Sales', title='Top 10 Products by Sales')
fig.show()
top_products = df.groupby('Product Name')['Profit'].sum().nlargest(10).reset_index()
top_products = top_products.sort_values(by='Profit', ascending=True)
fig = px.bar(top_products, y='Product Name', x='Profit', title='Top 10 Products by Profit')
fig.show()
Bottom 10 Product by Sales and Profit¶
bottom_products = df.groupby('Product Name')['Sales'].sum().nsmallest(10).reset_index()
bottom_products = bottom_products.sort_values(by='Sales', ascending=False)
fig = px.bar(bottom_products, y='Product Name', x='Sales', title='Bottom 10 Products by Sales')
fig.show()
bottom_products = df.groupby('Product Name')['Profit'].sum().nsmallest(10).reset_index()
bottom_products = bottom_products.sort_values(by='Profit', ascending=False)
fig = px.bar(bottom_products, y='Product Name', x='Profit', title='Bottom 10 Products by Profit',
labels={'Profit':'Profit', 'Product Name':'Product Name'},
orientation='h')
fig.show()
Sales and Profit by Year¶
df['Order Date'] = pd.to_datetime(df['Order Date'])
df['Year'] = df['Order Date'].dt.year
df['Quarter'] = df['Order Date'].dt.quarter
sales_profit_by_quarter = df.groupby(['Year', 'Quarter'])[['Sales', 'Profit']].sum().reset_index()
x_labels = []
for year in sales_profit_by_quarter['Year'].unique():
year_quarters = sales_profit_by_quarter[sales_profit_by_quarter['Year'] == year]['Quarter']
year_labels = [f"Q{q}-{year}" for q in year_quarters]
x_labels.extend(year_labels)
fig = go.Figure()
fig.add_trace(
go.Scatter(
x=x_labels,
y=sales_profit_by_quarter['Sales'],
mode='lines+markers',
name='Sales'
)
)
fig.add_trace(
go.Scatter(
x=x_labels,
y=sales_profit_by_quarter['Profit'],
mode='lines+markers',
name='Profit'
)
)
fig.update_layout(
title='Sales and Profit by Quarter and Year',
xaxis_title='Quarter and Year',
yaxis_title='Amount',
xaxis=dict(
tickmode='array',
tickvals=x_labels,
ticktext=x_labels,
tickangle=-45
),
yaxis=dict(title='Amount'),
yaxis2=dict(title='Amount', overlaying='y', side='right')
)
fig.show()
Insights:¶
We can see that there is steady growth of sales at the end of each year and the Sales are comparatively low in Quarter 1 of each year and also slight growth in profit also.
Impact of Profit by Discount¶
query4= """
SELECT Sales, Quantity, Discount, Profit
FROM df;
"""
df4=mysql(query4)
sns.pairplot(df4, palette='Set2', diag_kind='hist', markers=['o', 's', 'D', '^'])
plt.suptitle('Pairplot of Profit, Sales, Quantity, and Discount by Segment', y=1.02)
plt.tight_layout()
plt.show()
query4= """
SELECT Sales, Quantity, Discount, Profit
FROM df;
"""
df4=mysql(query4)
corr_matrix=df4.corr()
fig = px.imshow(corr_matrix.values,
labels=dict(x='Variables', y='Variables', color='Correlation'),
x=corr_matrix.index,
y=corr_matrix.columns,
color_continuous_scale='Viridis',
zmin=-1, zmax=1,
title='Correlation Heatmap of Sales, Quantity, Discount, and Profit')
annotations = []
for i, row in enumerate(corr_matrix.index):
for j, col in enumerate(corr_matrix.columns):
annotations.append(dict(
text=f'{corr_matrix.iloc[i, j]:.2f}',
x=col, y=row,
xref='x1', yref='y1',
font=dict(color='white' if abs(corr_matrix.iloc[i, j]) > 0.5 else 'black'),
showarrow=False)
)
fig.update_layout(
annotations=annotations,
width=800, height=600
)
fig.show()
Insights:¶
The heatmap and pairplot shows there is no imapct of Profit and Sales by discount as they show negative correlation(-.22,-.03).The company should focus on giving more discounts to see if there is any improvement in Sales.
Sales by Category and Sub-Category¶
q2 = """
SELECT Category, "Sub-Category" , SUM(Sales) AS Total_Sales
FROM df
GROUP BY Category, "Sub-Category";
"""
df5=mysql(q2)
fig = px.treemap(df5, path=['Category', 'Sub-Category'], values='Total_Sales',
color='Total_Sales', color_continuous_scale='RdBu',
title='Treemap of Sales by Category and Sub-Category')
fig.show()
Insights:¶
The above Tree Map shows the Sub Categories Sales where Phones are the most sold in Technology,Chairs in Furniture,Storage in Office Supplies
Distribution of Sales by Region¶
q3 = """
SELECT Region, Sales
FROM df;
"""
df7=mysql(q3)
fig = px.box(df7, x='Region', y='Sales', title='Box Plot of Sales by Region',
labels={'Sales': 'Sales Amount', 'Region': 'Region'})
fig.show()
Insights:¶
The Box Plot shows the distribution of Sales by Region.We can observe that there is a big Outlier of sales 22k in South ,14k in West and 17k in Central whereas there is no significant Outlier in East Region.
fig = px.scatter(df, x='Sales',y='Profit',size='Quantity',
title='Bubble Chart of Sales vs Profit with Quantity',
labels={'Sales': 'Sales Amount', 'Profit': 'Profit Amount', 'Quantity': 'Quantity'},
size_max=60)
fig.show()
q4 = """
SELECT "Customer Name", "Product ID","Product Name",Segment,Category,Region,Quantity,Sales,Profit
FROM df
WHERE Sales > 15000;
"""
Outlier_df=mysql(q4)
Outlier_df.head()
| Customer Name | Product ID | Product Name | Segment | Category | Region | Quantity | Sales | Profit | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | Sean Miller | TEC-MA-10002412 | Cisco TelePresence System EX90 Videoconferenci... | Home Office | Technology | South | 6 | 22638.48 | -1811.0784 |
| 1 | Tamara Chand | TEC-CO-10004722 | Canon imageCLASS 2200 Advanced Copier | Corporate | Technology | Central | 5 | 17499.95 | 8399.9760 |
Insights:¶
Most sales and profit values are between 0-10k, indicating the company standard performance range.The outliers at 17k shows a profit of 8.4k and at 22k shows a loss of 1.8k.
WordCloud Most Selling Product Names¶
q5 = """
SELECT "Product Name"
FROM df;
"""
df8=mysql(q5)
text_data = ' '.join(df8['Product Name'].dropna().tolist() )
text_data = text_data.lower()
text_data = re.sub(r'[^a-z\s]', '', text_data)
tokens = word_tokenize(text_data)
stop_words = set(stopwords.words('english'))
tokens = [word for word in tokens if word not in stop_words]
term_freq = Counter(tokens)
print(term_freq.most_common(10))
wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(term_freq)
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.title('Most Frequent Terms in Product Names',
fontdict={'fontsize': 18, 'fontweight': 'bold', 'color': 'Red'})
plt.show()
[('xerox', 865), ('x', 782), ('avery', 557), ('binders', 524), ('chair', 479), ('black', 426), ('phone', 374), ('gbc', 342), ('binder', 340), ('binding', 333)]
Insights:¶
The WordCloud shows most selling product names Xerox,chair,binders,avery,phone are the most selling products.
WaterFall Chart for Sales and Profit by Year¶
df11 = df.groupby('Year').agg({'Profit': 'sum', 'Sales': 'sum'}).reset_index()
df11['cumulative_profit'] = df11['Profit'].cumsum()
df11['cumulative_sales'] = df11['Sales'].cumsum()
fig = go.Figure()
fig.add_trace(go.Waterfall(name="Sales",orientation="v",
measure=["relative"] * len(df11),
x=df11['Year'],textposition="outside",
y=df11['cumulative_sales'],
connector={"line":{"color":"rgb(63, 63, 63)"}},
text= [f"Sales={Sales}" for Sales in df11['Sales']],
base=0))
fig.add_trace(go.Waterfall(name="Profit",orientation="v",
measure=["relative"] * len(df11),
x=df11['Year'],textposition="outside",
y=df11['cumulative_profit'],
increasing=dict(marker=dict(color='blue')),
decreasing=dict(marker=dict(color='blue')),
connector={"line":{"color":"rgb(63, 63, 63)"}},
text=[f"Profit={profit}" for profit in df11['Profit']],
base=0 ))
fig.update_layout(title="Waterfall Chart of Profit and Sales by Year",
showlegend=True,legend=dict(x=0.7, y=0.95),yaxis=dict(title='Amount'),
xaxis=dict(title='Year'),)
fig.show()
Insights:¶
The waterfall chart shows that there is no decrease in sales and profit by year and shows steady growth which is good factor for company.
Overall Insights¶
Regional Performance:¶
West and East Regions lead in both sales (61%) and profit contribution (70%), showing strong performance. South and Central Regions show lower profits(30%) compared to their sales (39%), indicating need for improvement.
Segment Performance:¶
Consumer Segment has the highest sales (51%) with a good profit contribution (46%). Corporate Segment follows with 30% of sales and 32% of profit. Home Office has the least impact on both sales and profit.
Category Performance:¶
Sales are equally distributed among Technology (36%), Furniture (32%), and Office Supplies (31%). Technology has a high profit contribution (50%), while Furniture shows significantly less profit with only 6.5% of overall profit compared to its sales share.
Impact of Discounts:¶
Discounts show a negative correlation (-0.22) with profit and sales.
Product Insights:¶
Top selling products are Phones, Chairs, and Storage across different sub-categories.
Sales Distribution by Region:¶
Outliers in sales are observed in South, West and Central regions, indicating areas of both high and low performance.
Overall Growth:¶
Stable growth indicated in sales and profit over the years.
Strategic Recommendations:¶
The Company should focus on expanding operations in profitable regions like West and East.
Improve discount strategies to balance sales growth with profitability.
Focus on improving profitability in the Furniture category.